Reverse-engineering the brain
# Fetch the MNIST digits (OpenML dataset 554) and preview the first 100
# images in a 10x10 grid with no margins between panels.
mnist_data <- getOMLDataSet(did = 554)$data
par(mfrow = c(10, 10), mai = c(0, 0, 0, 0))
for (i in seq_len(100)) {
  # Columns 1:784 hold the pixel intensities of one digit; reshape to 28x28.
  img <- matrix(as.matrix(mnist_data[i, 1:784]), nrow = 28)
  # Reverse the columns so the digit is drawn right side up;
  # gray(255:0 / 255) maps high intensity to dark ink.
  image(img[, nrow(img):1], axes = FALSE, col = gray(255:0 / 255))
}
# Perceptron activation: a hard threshold (step function) on the net input.
# NOTE(review): the original called `perceptron()` here, but `perceptron` is
# only ever assigned a makeLearner object below (never a function), so the
# call would error; it looks like slide residue and is disabled.
# perceptron()
px <- seq(-6, 6, length.out = 101)
plot(px, px >= 0, type = "l", xlab = "", ylab = "",
     main = "Perceptron response")
# Restrict MNIST to the digits 0 and 1 to get a binary classification
# problem; droplevels() removes the now-empty factor levels 2..9.
mnist_bin <- droplevels(subset(mnist_data, class %in% c(0, 1)))
# Scatter two (arbitrarily chosen) pixels against each other, colored by digit.
ggplot(data = mnist_bin, aes(x = pixel263, y = pixel518, color = class)) +
  geom_point()
# Wrap the binary data as an mlr classification task.
mnist_task <- makeClassifTask(data = mnist_bin, target = "class")
# Mimic a perceptron by building a neural net with a single hidden node.
# Strictly speaking this is a sigmoid perceptron, but it is close enough.
perceptron <- makeLearner(
  "classif.nnet",
  par.vals = list(size = 1, trace = FALSE)
)
# Show the learned decision boundary in the two chosen pixel dimensions.
plotLearnerPrediction(perceptron, mnist_task,
                      features = c("pixel263", "pixel518")) +
  theme_cowplot()
# --- Gradient descent demo: minimize f(w) = (w - 2)^2 -----------------------
# Fixes vs. original: `len=` relied on partial argument matching
# (spell out length.out), `=` assignment replaced by `<-`, and the trace
# vectors are preallocated instead of grown with c() inside the loop.
ws <- seq(0, 4, length.out = 20)  # grid of w-values for plotting
f <- function(w) {
  (w - 2)^2                       # target function; minimum at w = 2
}
plot(ws, f(ws), type = "l")       # plot the target

# Illustrate a single update step from w = 0.1 with learning rate 0.1:
# w_new = w - rate * gradient = 0.1 - 0.1 * 2 * (0.1 - 2)
w <- c(0.1, 0.1 - 0.1 * 2 * (0.1 - 2))
lines(w, f(w), type = "b", col = "blue")

grad <- function(w) {
  2 * (w - 2)                     # analytic gradient df/dw
}

w <- 0.1                          # initialize (first guess)
learningRate <- 0.8               # try smaller and larger values
n_steps <- 100
# Preallocate: initial point plus one entry per descent step.
wtrace <- numeric(n_steps + 1)    # x-values visited
ftrace <- numeric(n_steps + 1)    # corresponding y-values
wtrace[1] <- w
ftrace[1] <- f(w)
for (step in seq_len(n_steps)) {
  w <- w - learningRate * grad(w) # gradient descent update
  wtrace[step + 1] <- w
  ftrace[step + 1] <- f(w)
}
plot(ws, f(ws), type = "l")       # plot target
lines(wtrace, ftrace, type = "b", col = "blue")  # overlay the descent path
# Refit with trace = TRUE so nnet prints its own gradient-descent trace.
# Note that the implementation performs multiple restarts.
perceptron <- makeLearner(
  "classif.nnet",
  par.vals = list(size = 1, trace = TRUE)
)
plotLearnerPrediction(perceptron, mnist_task, cv = 0,
                      features = c("pixel263", "pixel518")) +
  theme_cowplot()
# Example output:
# weights: 5 initial value 9088.225535 iter 10 value 3424.542088 iter 20 value 3362.401146 final value 3362.376301 converged
# Biological motivation: a neuron's spiking frequency saturates smoothly
# with stimulation, like a sigmoid. sigmoid() is provided by pracma.
library(pracma)
plot(px, sigmoid(px, a = 1),
     type = "l",
     main = "Sigmoid",
     xlab = "amount of stimulation",
     ylab = "spiking frequency (normalized)")
Let's start with a simpler dataset
# A smaller toy problem that is still not linearly separable:
# two interleaved spirals with a little Gaussian noise.
library(mlbench)
spirals <- as.data.frame(mlbench.spirals(n = 1000, cycles = 1.5, sd = 0.05))
spiral_task <- makeClassifTask(data = spirals, target = "classes")
# coord_fixed keeps the aspect ratio 1:1 so the spirals look round.
ggplot(data = spirals, aes(x = x.1, y = x.2, color = classes)) +
  geom_point() +
  coord_fixed(ratio = 1)
The perceptron can't learn it
# The (sigmoid) perceptron again: a single hidden node cannot separate
# the spirals, as the linear-ish decision boundary shows.
mlp <- makeLearner("classif.nnet", par.vals = list(size = 1, trace = FALSE))
plotLearnerPrediction(mlp, spiral_task,
                      features = c("x.1", "x.2")) +
  theme_cowplot()
We can build a neural net, with 1 layer of hidden nodes
# Fit a 1-hidden-layer net (10 nodes) on the spirals data directly with nnet.
# Fix: the original passed `rand.vars` and `resp`, which are defined nowhere
# in this file; use nnet's formula interface on the spirals data instead.
library(nnet)
mod1 <- nnet(classes ~ ., data = spirals, size = 10, trace = FALSE)
# NOTE(review): plot.nnet() is not part of the nnet package -- it comes from
# NeuralNetTools (or the well-known plot.nnet gist); confirm it is loaded.
plot.nnet(mod1)
# One hidden layer with 10 nodes: enough capacity to start bending the
# decision boundary around the spirals.
mlp <- makeLearner("classif.nnet", par.vals = list(size = 10, trace = FALSE))
plotLearnerPrediction(mlp, spiral_task,
                      features = c("x.1", "x.2")) +
  theme_cowplot()
# One hidden layer with 100 nodes: much more flexible decision boundary.
mlp <- makeLearner("classif.nnet", par.vals = list(size = 100, trace = FALSE))
plotLearnerPrediction(mlp, spiral_task,
                      features = c("x.1", "x.2")) +
  theme_cowplot()
# The sigmoid once more, now labeled in neural-network terms: it maps the
# weighted sum of a node's inputs (u) to the node's output.
plot(px, sigmoid(px, a = 1),
     type = "l",
     main = "Sigmoid",
     xlab = "Weighted sum of inputs (u)",
     ylab = "Output")
Nice property: $$\frac{\partial \sigma(u)}{\partial u} = \sigma(u)\,(1-\sigma(u))$$

For hidden node $h$, one layer before nodes $k$:
\begin{align}
\frac{\partial E}{\partial u_h} &= \sum_{k \in outs(h)} \frac{\partial E}{\partial u_k} \frac{\partial u_k}{\partial u_h} \quad(\text{sum rule + chain rule})\\
&= \sum_{k \in outs(h)} -\delta_k \frac{\partial u_k}{\partial u_h} \\
&= \sum_{k \in outs(h)} -\delta_k \frac{\partial u_k}{\partial o_h} \frac{\partial o_h}{\partial u_h} \\
&= \sum_{k \in outs(h)} -\delta_k w_{k,h} \frac{\partial o_h}{\partial u_h} \\
&= \sum_{k \in outs(h)} -\delta_k w_{k,h}\, o_h(1-o_h)
\end{align}
$$\delta_h = o_h(1-o_h) \sum_{k \in outs(h)} \delta_k w_{k,h}$$

mod1<-nnet(rand.vars,resp,data=spirals,size=10,trace=F)
plot.nnet(mod1)


Stacked sparse autoencoders encode increasingly complex features

# Larger net (200 hidden nodes) on the binary-MNIST task.
# NOTE(review): earlier MNIST plots use pixel263/pixel518; pixel217 here may
# be a typo -- confirm which pixel pair was intended.
mlp = makeLearner("classif.nnet", par.vals = list(size = 200, trace = FALSE))
plotLearnerPrediction(mlp, mnist_task, features = c("pixel217", "pixel518")) + theme_cowplot()
Thanks to Bernd Bischl and Tobias Glasmachers for useful input.